%%javascript
// JS snippet to save the notebook as html file to display the plotly graphs
// Registers module paths for d3, jquery and plotly with the AMD loader so the
// exported page can resolve them from public CDNs.
// NOTE(review): the "plotly-latest" CDN alias is reportedly frozen at an old
// plotly.js release — consider pinning an explicit version; confirm first.
require.config({
// Module name -> URL (the loader appends the ".js" extension itself).
paths: {
d3: 'https://cdnjs.cloudflare.com/ajax/libs/d3/5.9.2/d3',
jquery: 'https://code.jquery.com/jquery-3.4.1.min',
plotly: 'https://cdn.plot.ly/plotly-latest.min'
},
// plotly is declared as depending on d3 and jquery, and is exposed under
// the global name given in "exports".
shim: {
plotly: {
deps: ['d3', 'jquery'],
exports: 'plotly'
}
}
});
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import plotly.subplots as sp
import plotly.figure_factory as ff
from itertools import cycle
import re
# Widen pandas' display limits so wide frames and long text cells are shown
# in full, then load the titles table and preview its first row.
for option_name, option_value in (
    ('display.max_columns', 100),
    ('display.max_rows', 100),
    ('display.max_colwidth', None),
):
    pd.set_option(option_name, option_value)

df = pd.read_csv("data/titles.csv")
df.head(1)
| id | title | type | description | release_year | age_certification | runtime | genres | production_countries | seasons | imdb_id | imdb_score | imdb_votes | tmdb_popularity | tmdb_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ts300399 | Five Came Back: The Reference Films | SHOW | This collection includes 12 World War II-era propaganda films — many of which are graphic and offensive — discussed in the docuseries "Five Came Back." | 1945 | TV-MA | 48 | ['documentation'] | ['US'] | 1.0 | NaN | NaN | NaN | 0.6 | NaN |
# Print column dtypes and non-null counts to see which columns have gaps.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5806 entries, 0 to 5805 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 5806 non-null object 1 title 5805 non-null object 2 type 5806 non-null object 3 description 5788 non-null object 4 release_year 5806 non-null int64 5 age_certification 3196 non-null object 6 runtime 5806 non-null int64 7 genres 5806 non-null object 8 production_countries 5806 non-null object 9 seasons 2047 non-null float64 10 imdb_id 5362 non-null object 11 imdb_score 5283 non-null float64 12 imdb_votes 5267 non-null float64 13 tmdb_popularity 5712 non-null float64 14 tmdb_score 5488 non-null float64 dtypes: float64(5), int64(2), object(8) memory usage: 680.5+ KB
# Summary statistics for the numeric columns, transposed to one row per column.
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| release_year | 5806.0 | 2016.013434 | 7.324883 | 1945.000000 | 2015.00000 | 2018.000 | 2020.00000 | 2022.000 |
| runtime | 5806.0 | 77.643989 | 39.474160 | 0.000000 | 44.00000 | 84.000 | 105.00000 | 251.000 |
| seasons | 2047.0 | 2.165608 | 2.636207 | 1.000000 | 1.00000 | 1.000 | 2.00000 | 42.000 |
| imdb_score | 5283.0 | 6.533447 | 1.160932 | 1.500000 | 5.80000 | 6.600 | 7.40000 | 9.600 |
| imdb_votes | 5267.0 | 23407.194988 | 87134.315849 | 5.000000 | 521.00000 | 2279.000 | 10144.00000 | 2268288.000 |
| tmdb_popularity | 5712.0 | 22.525660 | 68.849177 | 0.009442 | 3.15525 | 7.478 | 17.77575 | 1823.374 |
| tmdb_score | 5488.0 | 6.818039 | 1.171560 | 0.500000 | 6.10000 | 6.900 | 7.50000 | 10.000 |
# Overview figure: a 3x3 grid with seven histograms (release year, runtime,
# IMDB votes/score, TMDB popularity/score, seasons) and two pie charts
# (age certification, movie vs. show).

# Every trace draws its colour from this endless cycle, in creation order.
palette = cycle(px.colors.sequential.thermal)

fig = sp.make_subplots(
    rows=3, cols=3,
    subplot_titles=[
        "Yearwise Release Count",
        "Runtime",
        "IMDB Votes",
        "IMDB Rating",
        "TMDB Popularity",
        "TMDB Score",
        "Seasons",
        "Age Certification",
        "Movie Or Show",
    ],
    specs=[
        [{"type": "histogram"}, {"type": "histogram"}, {"type": "histogram"}],
        [{"type": "histogram"}, {"type": "histogram"}, {"type": "histogram"}],
        [{"type": "histogram"}, {"type": "pie"}, {"type": "pie"}],
    ],
)

# Requested bin count for the wide-ranging columns: one bin per 50 rows.
# Computed once with len() instead of calling df.__len__() per trace.
fine_bin_count = len(df) // 50

release_year = go.Histogram(
    x=df.release_year,
    name="Release Year",
    marker_color=next(palette),
    legendgroup="Release Year",
    legendgrouptitle_text="Release Year",
)
runtime = go.Histogram(
    x=df.runtime,
    nbinsx=fine_bin_count,
    name="Runtime",
    marker_color=next(palette),
    legendgroup="Runtime",
    legendgrouptitle_text="Runtime",
)
imdb_votes = go.Histogram(
    x=df.imdb_votes,
    nbinsx=fine_bin_count,
    name="IMDB Votes",
    marker_color=next(palette),
    legendgroup="IMDB Votes",
    legendgrouptitle_text="IMDB Votes",
)
imdb_score = go.Histogram(
    x=df.imdb_score,
    nbinsx=10,
    name="IMDB Score",
    marker_color=next(palette),
    legendgroup="IMDB Score",
    legendgrouptitle_text="IMDB Score",
)
tmdb_popularity = go.Histogram(
    x=df.tmdb_popularity,
    name="TMDB Popularity",
    nbinsx=fine_bin_count,
    marker_color=next(palette),
    legendgroup="TMDB Popularity",
    legendgrouptitle_text="TMDB Popularity",
)
tmdb_score = go.Histogram(
    x=df.tmdb_score,
    name="TMDB Score",
    nbinsx=10,
    marker_color=next(palette),
    legendgroup="TMDB Score",
    legendgrouptitle_text="TMDB Score",
)
seasons = go.Histogram(
    x=df.seasons,
    name="Seasons",
    marker_color=next(palette),
    legendgroup="Seasons",
    legendgrouptitle_text="Seasons",
)

# value_counts() drops missing values, so they are added back as their own
# "Not Available" slice.
age_certification_counts = df.age_certification.value_counts()
age_certification_counts["Not Available"] = df.age_certification.isna().sum()
age_certification_dict = age_certification_counts.to_dict()
age_certification = go.Pie(
    labels=list(age_certification_dict.keys()),
    values=list(age_certification_dict.values()),
    name="Age Certification",
    hoverinfo="label+value+percent",
    marker_colors=[next(palette) for _ in age_certification_dict],
    legendgroup="Age Certification",
    legendgrouptitle_text="Age Certification",
)

type_counts = df.type.value_counts().to_dict()
type_ = go.Pie(
    labels=list(type_counts.keys()),
    values=list(type_counts.values()),
    name="Type",
    hoverinfo="label+value+percent",
    marker_colors=[next(palette) for _ in type_counts],
    legendgroup="Type",
    legendgrouptitle_text="Type",
)

# (trace, row, col, x-axis title) for each histogram cell; every histogram
# shares the same "Count" y-axis title.
histogram_panels = [
    (release_year, 1, 1, "Release Year"),
    (runtime, 1, 2, "#Runtime"),
    (imdb_votes, 1, 3, "No. of IMDB Votes"),
    (imdb_score, 2, 1, "#IMDB Score"),
    (tmdb_popularity, 2, 2, "#TMDB Popularity"),
    (tmdb_score, 2, 3, "#TMDB Score"),
    (seasons, 3, 1, "No. of Seasons"),
]
for panel_trace, panel_row, panel_col, panel_x_title in histogram_panels:
    fig.add_trace(panel_trace, row=panel_row, col=panel_col)
    fig.update_xaxes(title_text=panel_x_title, row=panel_row, col=panel_col)
    fig.update_yaxes(title_text="Count", row=panel_row, col=panel_col)

# Pie cells have no cartesian axes, so only the traces are placed.
fig.add_trace(age_certification, row=3, col=2)
fig.add_trace(type_, row=3, col=3)

# Subplot titles are stored as annotations; enlarge them.
fig.update_annotations(font_size=23)
fig.update_layout(
    template="plotly",
    height=1400,
)
fig.update(
    layout_title_text="Distribution of Characteristics of Movies and Series",
    layout_title_font_size=30,
    layout_title_x=0.5,
    layout_paper_bgcolor='rgb(229, 237, 247)',
    layout_plot_bgcolor='rgb(229, 237, 247)',
)
fig.show(renderer='notebook')
# Box plots of the six numeric characteristics, laid out on a 2x3 grid.
palette = cycle(px.colors.qualitative.Dark2_r)

fig = sp.make_subplots(
    rows=2,
    cols=3,
    subplot_titles=[
        "Runtime",
        "Seasons",
        "IMDB Score",
        "IMDB Votes",
        "TMDB Popularity",
        "TMDB Score",
    ],
    specs=[
        [{"type": "box"}, {"type": "box"}, {"type": "box"}],
        [{"type": "box"}, {"type": "box"}, {"type": "box"}],
    ],
)

# Traces are created in this order so each takes the next palette colour.
runtime_box = go.Box(y=df.runtime, name="Runtime", marker_color=next(palette))
seasons_box = go.Box(y=df.seasons, name="Seasons", marker_color=next(palette))
imdb_score_box = go.Box(y=df.imdb_score, name="IMDB Score", marker_color=next(palette))
imdb_votes_box = go.Box(y=df.imdb_votes, name="IMDB Votes", marker_color=next(palette))
tmdb_popularity_box = go.Box(y=df.tmdb_popularity, name="TMDB Popularity", marker_color=next(palette))
tmdb_score_box = go.Box(y=df.tmdb_score, name="TMDB Score", marker_color=next(palette))

# (trace, row, col, x-axis title) for each grid cell.
box_panels = (
    (runtime_box, 1, 1, "Runtime"),
    (seasons_box, 1, 2, "No. of Seasons"),
    (imdb_score_box, 1, 3, "IMDB Score"),
    (imdb_votes_box, 2, 1, "No. of IMDB Votes"),
    (tmdb_popularity_box, 2, 2, "TMDB Popularity"),
    (tmdb_score_box, 2, 3, "TMDB Score"),
)
for box_trace, box_row, box_col, box_x_title in box_panels:
    fig.add_trace(box_trace, row=box_row, col=box_col)
    fig.update_xaxes(title_text=box_x_title, row=box_row, col=box_col)

fig.update_layout(template="plotly", height=1080)
fig.update_annotations(font_size=23)
fig.update_layout(
    title_text="Box Plots of Characteristics of Movies and Series",
    title_font_size=30,
    title_x=0.5,
    paper_bgcolor='rgb(229, 237, 247)',
    plot_bgcolor='rgb(229, 237, 247)',
)
fig.show(renderer='notebook')
Generating Feature-Columns for every genre
# Turn the stringified genre list of each title into a real list of genre
# names, then add one 0/1 indicator column per distinct genre.
# The pattern is a raw string: "\w" in a normal string literal is an invalid
# escape sequence that newer Python versions warn about.
df["genres"] = df["genres"].apply(lambda raw: re.findall(r"\w+", raw))

# Distinct genre names, sorted so the list (and the order in which indicator
# columns are appended) is the same on every run.
genres = sorted({name for tags in df["genres"] for name in tags})

for genre in genres:
    # Bind the current genre as a default argument so the lambda does not
    # depend on the loop variable at call time.
    df[genre] = df["genres"].apply(lambda tags, wanted=genre: int(wanted in tags))

print("Number of Genres: ", len(genres))
print("Genres:", genres)
Number of Genres: 19 Genres: ['crime', 'european', 'action', 'animation', 'reality', 'thriller', 'war', 'history', 'romance', 'fantasy', 'scifi', 'drama', 'western', 'comedy', 'music', 'family', 'sport', 'horror', 'documentation']
# Number of titles per genre, separately for movies and shows, drawn as two
# stacked bar charts.

# Filter each title type once instead of re-running the query per genre.
movie_rows = df.query("type == 'MOVIE'")
show_rows = df.query("type == 'SHOW'")

# The indicator columns are 0/1, so their sum is the number of titles in the
# genre. Keys are inserted in alphabetical genre order.
genre_movie_dict = {genre: movie_rows[genre].sum() for genre in sorted(genres)}
genre_series_dict = {genre: show_rows[genre].sum() for genre in sorted(genres)}

fig = sp.make_subplots(
    rows=2,
    cols=1,
    subplot_titles=["Movies", "Series"],
)

# Bars are coloured by their own height through the given colour list.
genre_movie_count = go.Bar(
    x=list(genre_movie_dict.keys()),
    y=list(genre_movie_dict.values()),
    marker=dict(color=list(genre_movie_dict.values()),
                colorscale=px.colors.qualitative.Dark2),
    name="Movies",
)
genre_series_count = go.Bar(
    x=list(genre_series_dict.keys()),
    y=list(genre_series_dict.values()),
    marker=dict(color=list(genre_series_dict.values()),
                colorscale=px.colors.qualitative.Dark2),
    name="Series",
)

fig.add_trace(genre_movie_count, row=1, col=1)
fig.update_xaxes(title_text="Genres", row=1, col=1)
fig.update_yaxes(title_text="Count", row=1, col=1)

fig.add_trace(genre_series_count, row=2, col=1)
fig.update_xaxes(title_text="Genres", row=2, col=1)
fig.update_yaxes(title_text="Count", row=2, col=1)

fig.update(
    layout_title_text="Genre Distribution based on No. of Movies and Shows",
    layout_title_font_size=30,
    layout_title_x=0.5,
    layout_template="plotly",
    layout_showlegend=False,
    layout_height=800,
    layout_paper_bgcolor='rgb(229, 237, 247)',
    layout_plot_bgcolor='rgb(229, 237, 247)',
)
fig.update_annotations(font_size=18)
fig.show(renderer='notebook')
# Total IMDB votes per genre, separately for movies and shows, drawn as two
# stacked bar charts.

# Filter each title type once instead of re-running the query per genre.
movie_rows = df.query("type == 'MOVIE'")
show_rows = df.query("type == 'SHOW'")

# Sum the votes of the rows flagged for the genre. A direct mask yields 0
# when a genre has no titles of that type, where indexing a groupby result
# with [1] would raise KeyError. Keys are in alphabetical genre order.
genre_movies_popularity_dict = {
    genre: movie_rows.loc[movie_rows[genre] == 1, "imdb_votes"].sum()
    for genre in sorted(genres)
}
genre_series_popularity_dict = {
    genre: show_rows.loc[show_rows[genre] == 1, "imdb_votes"].sum()
    for genre in sorted(genres)
}

fig = sp.make_subplots(
    rows=2,
    cols=1,
    subplot_titles=["Movies", "Series"],
)

# Bars are coloured by their own height through the given colour list.
genre_movies_pop = go.Bar(
    x=list(genre_movies_popularity_dict.keys()),
    y=list(genre_movies_popularity_dict.values()),
    marker=dict(color=list(genre_movies_popularity_dict.values()),
                colorscale=px.colors.qualitative.Dark2),
    hoverinfo="x+y",
)
genre_series_pop = go.Bar(
    x=list(genre_series_popularity_dict.keys()),
    y=list(genre_series_popularity_dict.values()),
    marker=dict(color=list(genre_series_popularity_dict.values()),
                colorscale=px.colors.qualitative.Dark2),
    hoverinfo="x+y",
)

fig.add_trace(genre_movies_pop, row=1, col=1)
fig.update_xaxes(title_text="Genre", row=1, col=1)
fig.update_yaxes(title_text="IMDB Votes", row=1, col=1)

fig.add_trace(genre_series_pop, row=2, col=1)
fig.update_xaxes(title_text="Genre", row=2, col=1)
fig.update_yaxes(title_text="IMDB Votes", row=2, col=1)

fig.update(
    layout_title_text="Genre Distribution based on IMDB Votes",
    layout_title_font_size=30,
    layout_title_x=0.5,
    layout_template="plotly",
    layout_showlegend=False,
    layout_height=800,
    layout_paper_bgcolor='rgb(229, 237, 247)',
    layout_plot_bgcolor='rgb(229, 237, 247)',
)
fig.update_annotations(font_size=18)
fig.show(renderer='notebook')
# One IMDB-score box per genre (alphabetical), each in the next palette colour.
palette = cycle(px.colors.qualitative.Dark2)
fig = go.Figure()

# zip() stops with the genre list, so the endless palette is advanced exactly
# once per genre.
for genre, box_colour in zip(sorted(genres), palette):
    genre_rows = df[df[genre] == 1]
    score_box = go.Box(
        y=genre_rows['imdb_score'],
        name=genre,
        marker_color=box_colour,
        marker_size=5,
        line_width=1,
        # Hover text: the score, the genre, and how many titles it has.
        hovertemplate="<b>%{y:.2f}</b>" + f"<br>{genre}<br>Count-{len(genre_rows)}<extra></extra>",
    )
    fig.add_trace(score_box)

fig.update_layout(
    title="IMDB Score Box Distribution by Genre",
    title_font_size=30,
    title_x=0.5,
    yaxis_title="IMDB Score",
    xaxis_title="Genre",
    template="plotly",
    margin={"l": 40, "r": 30, "b": 80, "t": 100},
    showlegend=False,
    paper_bgcolor='rgb(229, 237, 247)',
    plot_bgcolor='rgb(229, 237, 247)',
)
fig.show(renderer='notebook')
# Density curves of IMDB score, one per genre (alphabetical order).
genre_order = sorted(genres)
has_score = df['imdb_score'].notna()

# Titles without a score are excluded from every sample.
score_samples = [
    df[(df[genre] == 1) & has_score]['imdb_score']
    for genre in genre_order
]

fig = ff.create_distplot(
    score_samples,
    genre_order,
    show_hist=False,
    show_rug=False,
)
fig.update_layout(
    title="IMDB Score Distribution by Genre",
    title_font_size=30,
    title_x=0.5,
    xaxis_title="IMDB Score",
    template="plotly",
    paper_bgcolor='rgb(229, 237, 247)',
    plot_bgcolor='rgb(229, 237, 247)',
    legend_title="Genre",
)
fig.show(renderer='notebook')
# Parse the production-country lists, derive each title's main country as an
# ISO alpha-3 code, and draw a world map of title counts per country.


def _clean_country_codes(raw):
    """Return the list of country codes parsed from one stringified list.

    A "Lebanon" entry is replaced by the code "LB", and a list containing
    the placeholder "XX" is treated as having no known country.
    """
    # Raw-string pattern: "\w" in a normal literal is an invalid escape.
    codes = re.findall(r"\w+", raw)
    if "Lebanon" in codes:
        return ["LB"]
    if "XX" in codes:
        return []
    return codes


df["production_countries"] = df["production_countries"].apply(_clean_country_codes)

# Distinct country codes seen anywhere in the data.
production_countries = list(df["production_countries"].values)
production_countries = list({code for codes in production_countries for code in codes})

# The first listed country is taken as the main one; None when the list is empty.
df["main_production_country_alpha_2"] = df.production_countries.apply(
    lambda codes: codes[0] if codes else None
)

# alpha-2 -> alpha-3 lookup. An unknown code raises KeyError, which surfaces
# bad entries instead of hiding them.
country_alpha = pd.read_json("data/countries.json")[["name", "alpha_2", "alpha_3"]]
country_alpha = dict(zip(country_alpha.alpha_2, country_alpha.alpha_3))
df["main_production_country_alpha_3"] = df.main_production_country_alpha_2.apply(
    lambda code: country_alpha[code] if code else None
)

# Count titles per main country once and reuse it for locations and values.
country_counts = df.main_production_country_alpha_3.value_counts()

fig = go.Figure()
map_plot = go.Choropleth(
    locations=country_counts.index,
    z=country_counts.values,
    colorscale="Reds",
    autocolorscale=False,
    colorbar_title="No. of Movies",
)
fig.add_trace(map_plot)
fig.update_layout(
    title_text="Production Countries",
    title_font_size=30,
    height=680,
    title_x=0.5,
    geo=dict(
        showframe=False,
        showcoastlines=False,
        projection_type='equirectangular'
    ),
)
# Show through the same renderer as every other figure in this notebook,
# rather than relying on the implicit display of the cell's last expression.
fig.show(renderer='notebook')
# Bucket every title's IMDB score into a one-point range ("7-8", ...) and show,
# per genre, how many titles fall into each range as a grid of small tables.


def _score_bucket(score):
    """Return the "<floor>-<floor + 1>" range label for one IMDB score."""
    # NOTE(review): nan_to_num turns a missing score into 0, so unrated
    # titles are counted in the "0-1" range — confirm this is intended.
    lower = int(np.nan_to_num(score) * 10 // 10)
    return f"{lower}-{lower + 1}"


distrib = df['imdb_score'].map(_score_bucket)
# df.insert raises if the column already exists, so overwrite it when this
# cell is run again.
if 'imdb_score_range' in df.columns:
    df['imdb_score_range'] = distrib
else:
    df.insert(12, 'imdb_score_range', distrib)

# genre -> {range label: number of titles}, built in alphabetical genre order
# so it lines up with the subplot titles below.
score_range_dict = {
    genre: df.loc[df[genre] == 1, 'imdb_score_range'].value_counts().to_dict()
    for genre in sorted(genres)
}

GRID_ROWS, GRID_COLS = 4, 5
fig = sp.make_subplots(
    rows=GRID_ROWS,
    cols=GRID_COLS,
    subplot_titles=sorted(genres),
    specs=[[{'type': 'table'} for _ in range(GRID_COLS)] for _ in range(GRID_ROWS)],
    horizontal_spacing=0.01,
    vertical_spacing=0.05,
)

for position, (genre_name, range_counts) in enumerate(score_range_dict.items()):
    # Subplot titles are assigned left-to-right, top-to-bottom, so each table
    # must be placed in the same row-major order to sit under its own title.
    grid_row, grid_col = divmod(position, GRID_COLS)
    fig.add_trace(
        go.Table(
            header=dict(
                values=["IMDB Score Range", "Count"],
                align="center",
            ),
            cells=dict(
                values=[list(range_counts.keys()), list(range_counts.values())],
                align="center",
            ),
        ),
        row=grid_row + 1,
        col=grid_col + 1,
    )

fig.update_layout(
    title_text="IMDB Score Distribution by Genre",
    title_font_size=30,
    title_x=0.5,
    height=1000,
    autosize=True,
)
fig.show(renderer='notebook')
# Five movies with the most IMDB votes (ties broken by higher score).
movie_columns = [
    'release_year',
    'title',
    'type',
    'runtime',
    'imdb_score',
    'imdb_votes',
    'genres',
]
(
    df.query("type == 'MOVIE'")[movie_columns]
    .sort_values(by=["imdb_votes", "imdb_score"], ascending=False)
    .head(5)
    .reset_index(drop=True)
)
| release_year | title | type | runtime | imdb_score | imdb_votes | genres | |
|---|---|---|---|---|---|---|---|
| 0 | 2010 | Inception | MOVIE | 148 | 8.8 | 2268288.0 | [scifi, music, thriller, action] |
| 1 | 1994 | Forrest Gump | MOVIE | 142 | 8.8 | 1994599.0 | [drama, romance, comedy] |
| 2 | 2012 | Django Unchained | MOVIE | 165 | 8.4 | 1472668.0 | [western, drama] |
| 3 | 1998 | Saving Private Ryan | MOVIE | 169 | 8.6 | 1346020.0 | [drama, war] |
| 4 | 1976 | Taxi Driver | MOVIE | 113 | 8.3 | 795222.0 | [crime, drama] |
# Five shows with the most IMDB votes (ties broken by higher score).
show_columns = [
    'release_year',
    'title',
    'type',
    'runtime',
    'imdb_score',
    'imdb_votes',
    'genres',
]
(
    df.query("type == 'SHOW'")[show_columns]
    .sort_values(by=["imdb_votes", "imdb_score"], ascending=False)
    .head(5)
    .reset_index(drop=True)
)
| release_year | title | type | runtime | imdb_score | imdb_votes | genres | |
|---|---|---|---|---|---|---|---|
| 0 | 2008 | Breaking Bad | SHOW | 48 | 9.5 | 1727694.0 | [drama, thriller, crime] |
| 1 | 2016 | Stranger Things | SHOW | 52 | 8.7 | 989090.0 | [scifi, drama, fantasy, horror, thriller] |
| 2 | 2010 | The Walking Dead | SHOW | 46 | 8.2 | 945125.0 | [action, drama, scifi, thriller, horror] |
| 3 | 2011 | Black Mirror | SHOW | 59 | 8.8 | 515577.0 | [scifi, thriller, drama, european] |
| 4 | 2013 | House of Cards | SHOW | 52 | 8.7 | 494092.0 | [drama] |
# The most-voted US-produced show rated TV-PG.
us_tv_pg_shows = df.query(
    "type == 'SHOW' and main_production_country_alpha_3 == 'USA' and age_certification == 'TV-PG'"
)
summary_columns = ["id", "title", "release_year", "genres", "seasons", "runtime", "imdb_score", "imdb_votes"]
(
    us_tv_pg_shows
    .sort_values(by=["imdb_votes", "imdb_score"], ascending=False)
    .reset_index(drop=True)
    .head(1)
    [summary_columns]
)
| id | title | release_year | genres | seasons | runtime | imdb_score | imdb_votes | |
|---|---|---|---|---|---|---|---|---|
| 0 | ts20681 | Seinfeld | 1989 | [comedy] | 9.0 | 24 | 8.9 | 302700.0 |
# For every genre (alphabetical), pick the title with the most IMDB votes,
# breaking ties by higher score, and tag the row with that genre.
best_rows = []
for genre in sorted(genres):
    top_title = (
        df[df[genre] == 1]
        .sort_values(by=["imdb_votes", "imdb_score"], ascending=False)
        .head(1)
    )
    # assign() returns a new frame, so nothing is written onto a slice of df.
    best_rows.append(top_title.assign(selected_genre=genre))

# Concatenate once at the end: growing a frame inside the loop copies it on
# every iteration, and starting from an empty frame turns every column into
# object dtype.
if best_rows:
    best_by_genre = pd.concat(best_rows, ignore_index=True)
else:
    best_by_genre = pd.DataFrame(columns=df.columns.tolist() + ["selected_genre"])

best_by_genre[['release_year', 'title', 'selected_genre', 'imdb_score']]
| release_year | title | selected_genre | imdb_score | |
|---|---|---|---|---|
| 0 | 2010 | Inception | action | 8.8 |
| 1 | 2010 | How to Train Your Dragon | animation | 8.1 |
| 2 | 1994 | Forrest Gump | comedy | 8.8 |
| 3 | 2008 | Breaking Bad | crime | 9.5 |
| 4 | 2002 | Road to Perdition | documentation | 7.7 |
| 5 | 1994 | Forrest Gump | drama | 8.8 |
| 6 | 2006 | Casino Royale | european | 8.0 |
| 7 | 2010 | How to Train Your Dragon | family | 8.1 |
| 8 | 2016 | Stranger Things | fantasy | 8.7 |
| 9 | 2017 | Dunkirk | history | 7.8 |
| 10 | 2016 | Stranger Things | horror | 8.7 |
| 11 | 2010 | Inception | music | 8.8 |
| 12 | 2017 | GLOW | reality | 8.0 |
| 13 | 1994 | Forrest Gump | romance | 8.8 |
| 14 | 2010 | Inception | scifi | 8.8 |
| 15 | 2013 | Rush | sport | 8.1 |
| 16 | 2010 | Inception | thriller | 8.8 |
| 17 | 1998 | Saving Private Ryan | war | 8.6 |
| 18 | 2012 | Django Unchained | western | 8.4 |
# Highest-rated show of each release year.
# Rows are ordered by year, then by descending score, so the first row of
# each year group is that year's best-rated show.
gb = (
    df.query("type == 'SHOW'")
    .sort_values(by=["release_year", "imdb_score"], ascending=[True, False])
    .groupby("release_year")
)
# head(1) keeps the whole first row of each group. GroupBy.first() instead
# takes the first non-null value of each column separately, which can pair a
# title with a score from a different row when a value is missing.
gb.head(1).set_index("release_year")[["title", "imdb_score"]]
| title | imdb_score | |
|---|---|---|
| release_year | ||
| 1945 | Five Came Back: The Reference Films | NaN |
| 1969 | Monty Python's Flying Circus | 8.8 |
| 1972 | Monty Python's Fliegender Zirkus | 8.1 |
| 1981 | Danger Mouse | 7.4 |
| 1982 | Knight Rider | 6.9 |
| 1983 | Wheel of Fortune | 6.7 |
| 1984 | Thomas & Friends | 6.5 |
| 1987 | Fireman Sam | 6.1 |
| 1988 | High Risk | 3.8 |
| 1989 | Seinfeld | 8.9 |
| 1991 | My First Errand | NaN |
| 1992 | Barney & Friends | 3.8 |
| 1993 | Star Trek: Deep Space Nine | 8.1 |
| 1994 | The Magic School Bus | 7.8 |
| 1995 | Neon Genesis Evangelion | 8.5 |
| 1996 | Moesha | 5.7 |
| 1997 | Stargate SG-1 | 8.4 |
| 1998 | Cowboy Bebop | 8.9 |
| 1999 | One Piece | 8.8 |
| 2000 | Okupas | 9.0 |
| 2001 | Trailer Park Boys | 8.6 |
| 2002 | Still Game | 8.9 |
| 2003 | Chappelle's Show | 8.8 |
| 2004 | The Staircase | 7.8 |
| 2005 | Khawatir | 9.6 |
| 2006 | DEATH NOTE | 9.0 |
| 2007 | Heartland | 8.4 |
| 2008 | Breaking Bad | 9.5 |
| 2009 | Midnight Diner | 8.6 |
| 2010 | Downton Abbey | 8.7 |
| 2011 | Hunter x Hunter | 9.0 |
| 2012 | Call the Midwife | 8.5 |
| 2013 | Attack on Titan | 9.0 |
| 2014 | Raja, Rasoi Aur Anya Kahaniyaan | 9.0 |
| 2015 | Reply 1988 | 9.2 |
| 2016 | Leah Remini: Scientology and the Aftermath | 9.0 |
| 2017 | Crazy Delicious | 8.9 |
| 2018 | #ABtalks | 9.6 |
| 2019 | Our Planet | 9.3 |
| 2020 | The Last Dance | 9.1 |
| 2021 | Arcane | 9.1 |
| 2022 | Who Rules The World | 9.2 |
# Five best-rated titles that are both drama and thriller and whose main
# production country is India.
is_indian_drama_thriller = (
    (df['drama'] == 1)
    & (df['thriller'] == 1)
    & (df['main_production_country_alpha_3'] == 'IND')
)
(
    df[is_indian_drama_thriller]
    .sort_values(by=["imdb_score"], ascending=False)
    .reset_index(drop=True)
    .head(5)
    .loc[:, ["title", "release_year", "imdb_score"]]
)
| title | release_year | imdb_score | |
|---|---|---|---|
| 0 | Sacred Games | 2018 | 8.6 |
| 1 | Super Deluxe | 2019 | 8.4 |
| 2 | Single Slipper Size - 7 | 2019 | 8.4 |
| 3 | Article 15 | 2019 | 8.2 |
| 4 | Andhadhun | 2018 | 8.2 |
# Five best-rated titles that are both action and comedy and whose main
# production country is Great Britain.
is_british_action_comedy = (
    (df['action'] == 1)
    & (df['comedy'] == 1)
    & (df['main_production_country_alpha_3'] == 'GBR')
)
(
    df[is_british_action_comedy]
    .sort_values(by=["imdb_score"], ascending=False)
    .reset_index(drop=True)
    .head(5)
    .loc[:, ["title", "release_year", "imdb_score"]]
)
| title | release_year | imdb_score | |
|---|---|---|---|
| 0 | Octonauts | 2010 | 7.6 |
| 1 | Danger Mouse | 2015 | 7.2 |
| 2 | Sugar Rush | 2019 | 6.7 |
| 3 | Thomas & Friends | 1984 | 6.5 |
| 4 | David Brent: Life on the Road | 2016 | 6.3 |
# List every main production country with fewer than five titles, numbered
# and printed four to a line.
total_produced_countries = (
    df.groupby("main_production_country_alpha_3")['title'].count().to_dict()
)

# Reload the country table to map alpha-3 codes to display names.
country_alpha = pd.read_json("data/countries.json")[["name", "alpha_2", "alpha_3"]]
country_alpha3_name = country_alpha.set_index("alpha_3")["name"].to_dict()

count = 0
for alpha_3, title_count in total_produced_countries.items():
    if title_count >= 5:
        continue
    count += 1
    # Every fourth entry ends the line; the others are tab-separated.
    separator = "\n" if count % 4 == 0 else "\t"
    print(f"[{count}] {country_alpha3_name[alpha_3]} ({alpha_3})", end=separator)
[1] Afghanistan (AFG) [2] Angola (AGO) [3] Bangladesh (BGD) [4] Belarus (BLR) [5] Switzerland (CHE) [6] Cameroon (CMR) [7] Congo (the Democratic Republic of the) (COD) [8] Cuba (CUB) [9] Algeria (DZA) [10] Finland (FIN) [11] Georgia (GEO) [12] Ghana (GHA) [13] Greenland (GRL) [14] Guatemala (GTM) [15] Croatia (HRV) [16] Hungary (HUN) [17] British Indian Ocean Territory (IOT) [18] Iraq (IRQ) [19] Jordan (JOR) [20] Kenya (KEN) [21] Kyrgyzstan (KGZ) [22] Cambodia (KHM) [23] Lithuania (LTU) [24] Luxembourg (LUX) [25] Morocco (MAR) [26] Mozambique (MOZ) [27] Mauritius (MUS) [28] Malawi (MWI) [29] Namibia (NAM) [30] Pakistan (PAK) [31] Puerto Rico (PRI) [32] Portugal (PRT) [33] Paraguay (PRY) [34] Senegal (SEN) [35] Serbia (SRB) [36] Slovakia (SVK) [37] Syria (SYR) [38] Tanzania, the United Republic of (TZA) [39] Ukraine (UKR) [40] Uruguay (URY) [41] Venezuela (Bolivarian Republic of) (VEN) [42] Viet Nam (VNM) [43] Zimbabwe (ZWE)
# For every genre (alphabetical), pick the title with the longest runtime and
# tag the row with that genre.
longest_rows = []
for genre in sorted(genres):
    # head(1) keeps the whole top row. GroupBy.first() would instead take the
    # first non-null value of each column separately, mixing values from
    # different titles wherever the top row has a missing value.
    longest_title = (
        df[df[genre] == 1]
        .sort_values(by=['runtime'], ascending=False)
        .head(1)
    )
    longest_rows.append(longest_title.assign(selected_genre=genre))

# Concatenate once at the end rather than growing a frame inside the loop.
if longest_rows:
    longest_runtime = pd.concat(longest_rows, ignore_index=True)
else:
    longest_runtime = pd.DataFrame(columns=df.columns.tolist() + ["selected_genre"])

longest_runtime[['title', 'release_year', 'runtime', 'selected_genre']]
| title | release_year | runtime | selected_genre | |
|---|---|---|---|---|
| 0 | Jodhaa Akbar | 2008 | 213 | action |
| 1 | Mobile Suit Gundam III: Encounters in Space | 1982 | 144 | animation |
| 2 | The School of Mischief | 1973 | 251 | comedy |
| 3 | Bonnie & Clyde | 2013 | 240 | crime |
| 4 | A Lion in the House | 2006 | 225 | documentation |
| 5 | Bonnie & Clyde | 2013 | 240 | drama |
| 6 | Bonnie & Clyde | 2013 | 240 | european |
| 7 | 4K Fireplace | 2015 | 181 | family |
| 8 | Hum Saath Saath Hain | 1999 | 177 | fantasy |
| 9 | Jodhaa Akbar | 2008 | 213 | history |
| 10 | Apocalypse Now Redux | 2001 | 196 | horror |
| 11 | No Direction Home: Bob Dylan | 2005 | 208 | music |
| 12 | 4K Fireplace | 2015 | 181 | reality |
| 13 | Lagaan: Once Upon a Time in India | 2001 | 224 | romance |
| 14 | Blade Runner 2049 | 2017 | 164 | scifi |
| 15 | Lagaan: Once Upon a Time in India | 2001 | 224 | sport |
| 16 | The Irishman | 2019 | 209 | thriller |
| 17 | Jodhaa Akbar | 2008 | 213 | war |
| 18 | The Hateful Eight: Extended Version | 2019 | 199 | western |
# Five titles with the most seasons.
by_season_count = df.sort_values(by=['seasons'], ascending=False).reset_index(drop=True)
by_season_count.head(5).loc[:, ["title", "release_year", "seasons", "genres"]]
| title | release_year | seasons | genres | |
|---|---|---|---|---|
| 0 | Survivor | 2000 | 42.0 | [reality] |
| 1 | Wheel of Fortune | 1983 | 39.0 | [family] |
| 2 | The Challenge | 1998 | 37.0 | [reality, comedy, drama, scifi] |
| 3 | Power Rangers | 1993 | 29.0 | [scifi, action, fantasy, family] |
| 4 | Pokémon | 1997 | 24.0 | [scifi, action, comedy, fantasy, animation, family] |
# Shell escape: export this notebook (analysis.ipynb) to an HTML file with nbconvert.
!jupyter nbconvert --to html analysis.ipynb
[NbConvertApp] Converting notebook analysis.ipynb to html
c:\users\keshav\anaconda3\envs\pytorch_env\lib\site-packages\nbconvert\filters\widgetsdatatypefilter.py:69: UserWarning: Your element with mimetype(s) dict_keys(['application/vnd.plotly.v1+json']) is not able to be represented.
warn("Your element with mimetype(s) {mimetypes}"
[NbConvertApp] Writing 27130767 bytes to analysis.html